"""
转换成BERT词向量，768维，最大长度30
启动命令：
bert-serving-start -model_dir D:/BERT/chinese_L-12_H-768_A-12 -num_worker=2 -max_seq_len=30 -pooling_strategy=NONE
参数详情见：https://github.com/hanxiao/bert-as-service
"""
from bert_serving.client import BertClient
import numpy as np
import os


# Input text files for the train / dev / test sets.
# The 'Q' in the train and dev paths is a placeholder that the encoding loop
# below substitutes with a digit to obtain the actual file names.
train_data_path = r'./data/txt/train_xQ.txt'
dev_data_path = r'./data/txt/dev_xQ.txt'
test_data_path = r'./data/txt/test_x.txt'
paths = [train_data_path, dev_data_path]

# Make sure the output directory exists. exist_ok=True replaces the separate
# os.path.exists() check, so there is no window between "check" and "create"
# in which another process could create the directory and make makedirs fail.
# NOTE: if './data/npy' exists but is a regular file this now raises
# FileExistsError here instead of failing later at np.save().
os.makedirs(r'./data/npy', exist_ok=True)

# Client for the BERT server, created with default arguments.
# NOTE(review): presumably requires the server from the module docstring's
# start command to be running already -- confirm.
bc = BertClient()

# Training and validation sets.
# Each template in `paths` contains a 'Q' placeholder; substituting the digits
# 0-4 for it yields the five input files of that set.
for template in paths:
    for part in range(5):
        src_path = template.replace('Q', str(part))
        with open(src_path, 'r', encoding='utf-8') as src:
            # One sentence per line; strip() removes surrounding whitespace,
            # including the trailing newline.
            sentences = [raw.strip() for raw in src]
        # Convert the sentences to BERT vectors
        vectors = bc.encode(sentences)
        print(vectors.shape)
        # Save. replace() rewrites every 'txt' in the path, so both the
        # directory ('/txt/' -> '/npy/') and the extension ('.txt' -> '.npy')
        # change.
        out_path = src_path.replace('txt', 'npy')
        np.save(out_path, vectors)

# Test set: a single file, read from test_data_path as-is.
with open(test_data_path, 'r', encoding='utf-8') as src:
    # One sentence per line, surrounding whitespace removed
    test_sentences = [raw.strip() for raw in src]
# Convert the sentences to BERT vectors
x = bc.encode(test_sentences)
print(x.shape)
# Save. replace() rewrites every 'txt' in the path, so both the directory
# ('/txt/' -> '/npy/') and the extension ('.txt' -> '.npy') change.
test_out_path = test_data_path.replace('txt', 'npy')
np.save(test_out_path, x)
